knitr::opts_chunk$set(
  collapse = TRUE,
  eval = FALSE,
  comment = "#>"
)

# Print a set of documents as LaTeX (default) or pandoc markdown sections.
# Each document gets a (sub)section headed by its title and id, optionally
# followed by its description and OCR text.
documents_to_latex<-function(out_docs, include_description=TRUE, include_ocr=TRUE, pandoc_output=FALSE){
  for (doc in seq_len(nrow(out_docs))){
    if(pandoc_output){
      cat("\n\n")
      cat(paste0("## ", reportRx::sanitizestr(out_docs[doc, "title"]), " (id: ", out_docs[doc, "id"], ")"))
      cat("\n\n")
    } else {
      cat(paste0("\\subsection{", reportRx::sanitizestr(out_docs[doc, "title"]), " (id: ", out_docs[doc, "id"], ")}"))
      cat("  \n")
    }
    if(include_description){
      if(pandoc_output){
        cat("\n\n")
        cat("### description")
        cat("\n\n")
      } else {
        cat("\\subsubsection{description}")
        cat("  \n")
      }
      cat(reportRx::sanitizestr(out_docs[doc, "description"]))
      cat("\n\n")
    }
    if(include_ocr){
      if(pandoc_output){
        cat("\n\n")
        cat("### OCR")
        cat("\n\n")
      } else {
        cat("\\subsubsection{OCR}")
        cat("  \n")
      }
      cat(reportRx::sanitizestr(out_docs[doc, "ocr"]))
      if(pandoc_output){
        cat("  ")
      } else {
        cat("   \n")
      }
    }
  }
}
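As a quick illustration of the two output modes, a call along these lines (using a small, hypothetical data frame; the real out_docs comes from the database queries below) emits pandoc headings rather than LaTeX sectioning commands:

# hypothetical example data frame with the columns documents_to_latex expects
example_docs <- data.frame(id = 1,
                           title = "Example article",
                           description = "A short description.",
                           ocr = "Some OCR text.")
documents_to_latex(example_docs, pandoc_output = TRUE)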

Introduction

This vignette covers the process of classifying articles as election-violence related or not using machine learning: first from the search results returned by systematic archive searches, and then from the OCR text of the articles themselves.

library(durhamevp)

## also using tidyverse functions
library(tidyverse)

Obtain Classified and Unclassified Document Sets and Relate to Systematic Searches

# all documents in the database, with a standardized url for matching
alldocs<-get_document("all")
alldocs<-alldocs %>%
  dplyr::mutate(std_url = sub("download/", "", url))

# coding allocations, joined to the document records
docallocs<-get_allocation("all", allocation_type = "coding")

docallocs<-left_join(docallocs, alldocs, by=c("document_id"="id"))

# hand-coded documents with a definite electoral violence judgement
classdocs<-filter(docallocs, coding_complete==1, electoralviolence_nature %in% c("false", "true"))
classdocs<-classdocs %>%
  dplyr::mutate(std_url = sub("download/", "", url)) %>%
  dplyr::mutate(unclass=0, classified=1) %>%
  dplyr::mutate(EV_article=ifelse(electoralviolence_nature=="true", 1, 0))
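An optional quick check of the class balance in the hand-coded set (a strong imbalance here would matter when training the classifiers below):

table(classdocs$EV_article)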


systematic_search_terms<-c("incident",
                           "riot",
                           "mob",
                           "rough",
                           "disturbance",
                           "husting",
                           "adjourn",
                           "magistrate",
                           "police",
                           "prison",
                           "candidate",
                           "election",
                           "party")
all_searches<-get_archivesearches()
systematic_searches<-all_searches%>%
  dplyr::select(id, search_text, archive_date_start, archive_date_end) %>%
  filter(search_text %in% systematic_search_terms)

results_systematic_searches<-get_archivesearchresults(archive_search_id = systematic_searches$id) %>%
  left_join(all_searches, by=c("archive_search_id"="id")) %>%
  mutate(std_url = sub("download/", "", url))


# restrict to systematic search results whose documents have already been hand-coded
is_classified <- results_systematic_searches$std_url %in% classdocs$std_url

classified_results <- results_systematic_searches[is_classified, ]

Train Keyword Classifier

class_res_dfm<-durhamevp::searches_to_dfm(classified_results)
the_urls<-quanteda::docvars(class_res_dfm, "std_url")
ordered_cr<-classified_results[match(quanteda::docvars(class_res_dfm, "std_url"), classified_results$std_url),]

summary_class <- classdocs %>%
  group_by(std_url, document_id) %>%
  mutate(EV_article=electoralviolence_nature=="true") %>%
  summarize(EV_article=max(EV_article), sum_EV_article=sum(EV_article), n=n())

ordered_classified<-summary_class[match(quanteda::docvars(class_res_dfm, "std_url"), summary_class$std_url),]

quanteda::docvars(class_res_dfm, "EV_article")<-ordered_classified$EV_article

# use 85 per cent of the classified data to train the classifier
# the other 15 per cent are used as the testing set
n_train <- round(nrow(class_res_dfm)*.85)
the_sets<-split_dfm(class_res_dfm, n_train = n_train)
testing_urls<-quanteda::docvars(the_sets$testing_set, "std_url")
training_urls<-quanteda::docvars(the_sets$training_set, "std_url")

# train xgboost keyword classifier
classifier<-durhamevp::evp_classifiers(the_sets$training_set, "xgboost", "EV_article", "uniform")

# train naive bayes keyword classifier
classifier_nb<-durhamevp::evp_classifiers(the_sets$training_set, "nb", "EV_article", "uniform")

Evaluate Classifier on Test Set

Note that this step should be run regularly to check that the performance of the classifier is improving rather than degrading. It is not, however, a necessary part of the classification process.

# add predictions to the training and testing sets
quanteda::docvars(the_sets$training_set, "predicted_keywords")<-predict(classifier, newdata = the_sets$training_set, type="class")
quanteda::docvars(the_sets$testing_set, "predicted_keywords")<-predict(classifier, newdata = the_sets$testing_set, type="class")
quanteda::docvars(the_sets$testing_set, "predicted_keywords_nb")<-predict(classifier_nb, newdata = the_sets$testing_set, type="prob")[,2]

quanteda::docvars(the_sets$testing_set, "pred_class_keywords")<-factor(as.numeric(quanteda::docvars(the_sets$testing_set, "predicted_keywords")>.5))

results_table<-as.data.frame.matrix(caret::confusionMatrix(data=quanteda::docvars(the_sets$testing_set, "pred_class_keywords"), reference=factor(quanteda::docvars(the_sets$testing_set, "EV_article")), mode="prec_recall", positive="1")$table)
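Beyond the raw confusion matrix table, the caret::confusionMatrix object also carries summary statistics; a short extension pulling out precision, recall and F1 for the test set:

# extract summary statistics from the same confusion matrix object
cm <- caret::confusionMatrix(data=quanteda::docvars(the_sets$testing_set, "pred_class_keywords"),
                             reference=factor(quanteda::docvars(the_sets$testing_set, "EV_article")),
                             mode="prec_recall", positive="1")
cm$byClass[c("Precision", "Recall", "F1")]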

## **** probably delete****
tmp_train<-quanteda::docvars(the_sets$training_set)
p1<-ggplot(tmp_train, aes(predicted_keywords, EV_article))+
  geom_point(position=position_jitter(height=.1))+
  stat_smooth(method="glm", method.args = list(family="binomial"))+
  ggtitle("performance of the xgboost keyword classifier on the training set")

tmp_test<-quanteda::docvars(the_sets$testing_set)
p2<-ggplot(tmp_test, aes(predicted_keywords, EV_article))+
  geom_point(position=position_jitter(height=.1))+
  stat_smooth(method="glm", method.args = list(family="binomial"))+
  ggtitle("performance of the xgboost keyword classifier on the testing set")
p2

p3<-ggplot(tmp_test, aes(predicted_keywords_nb, EV_article))+
  geom_point(position=position_jitter(height=.1))+
  stat_smooth(method="glm", method.args = list(family="binomial"))+
  ggtitle("performance of the naive bayes keyword classifier on the testing set")
print(p3)
p4<-ggplot(tmp_test, aes(y=EV_article))+
  geom_point(colour="red", position=position_jitter(height=.1), aes(x=predicted_keywords_nb))+
  stat_smooth(method="glm", colour="red", method.args = list(family="binomial"), aes(x=predicted_keywords_nb))+
  geom_point(colour="green", position=position_jitter(height=.1), aes(x=predicted_keywords))+
  stat_smooth(method="glm", colour="green", method.args = list(family="binomial"), aes(x=predicted_keywords))+
  ggtitle("naive bayes (red) v xgboost (green) keyword classifiers on the testing set")

Use Classifier on Unclassified Cases

We do not examine status 1 or status 3 documents.

undownloaded_candidates<-get_candidate_documents(status =c("0", "2", "4", "5", "6", "7", "8", ""), include_ocr=FALSE)
undownloaded_candidates<- undownloaded_candidates %>%
  dplyr::mutate(std_url = sub("download/", "", url))

need_classifying <-results_systematic_searches$std_url %in% undownloaded_candidates$std_url

toclassify_results<-results_systematic_searches[need_classifying,]

toclass_res_dfm<-durhamevp::searches_to_dfm(toclassify_results)
classify_urls<-quanteda::docvars(toclass_res_dfm, "std_url")

# one row per unique std_url with its publication metadata
toclass_othervars <- toclassify_results %>%
  select(newspaper_title, publication_date, publication_location, type, std_url) %>%
  group_by(std_url, newspaper_title, publication_date, publication_location, type) %>%
  summarize()

ordered_tcov<-toclass_othervars[match(quanteda::docvars(toclass_res_dfm, "std_url"), toclass_othervars$std_url),]

quanteda::docvars(toclass_res_dfm, "publication_date")<-ordered_tcov$publication_date
quanteda::docvars(toclass_res_dfm, "publication_location")<-ordered_tcov$publication_location
quanteda::docvars(toclass_res_dfm, "newspaper_title")<-ordered_tcov$newspaper_title
quanteda::docvars(toclass_res_dfm, "type")<-ordered_tcov$type

# this step restricts the dfm to the same features (columns) as the training_set dfm, in the same order
toclass_res_dfm<-quanteda::dfm_select(toclass_res_dfm, the_sets$training_set)

# this step actually implements the prediction
quanteda::docvars(toclass_res_dfm, "predicted_keywords")<-predict(classifier, newdata = toclass_res_dfm, type="class")

From Document Feature Matrix Results to Candidate Documents Results

The results set contains many references to the same document (perhaps with different descriptions). The next step is to identify the unique candidate documents to which these results relate and to add the classification to them (where this is a subset, the classification is always 1).
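As an optional sanity check, the number of search results can be compared with the number of unique documents they point to:

# several search results can point at the same underlying document
nrow(toclassify_results)
dplyr::n_distinct(toclassify_results$std_url)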

undownloaded_candidates_with_kwpred<-left_join(undownloaded_candidates, quanteda::docvars(toclass_res_dfm), by=c("std_url", "publication_date", "publication_location", "type"))

# add election name
undownloaded_candidates_with_kwpred<-undownloaded_candidates_with_kwpred %>%
  group_by(publication_date) %>%
  summarise() %>%
  mutate(election_name=date_to_election(publication_date)) %>%
  right_join(undownloaded_candidates_with_kwpred, by="publication_date")

## if working election by election, subset to 1874 (or another desired election)

kwclass_1874<-undownloaded_candidates_with_kwpred %>%
  filter(election_name=="1874") %>%
  arrange(-predicted_keywords)

hist(kwclass_1874$predicted_keywords)


kwclass_1874_top5000<-kwclass_1874 %>% 
  slice(1:5000) %>%
  select(id, url, publication_title, description, status, g_status, title, status_writer, predicted_keywords)

kwclass_1874_top5000$status<-1
kwclass_1874_top5000$status_writer<-"xgboost_keyword"

Save results

to_csv<-kwclass_1874_top5000[,c("id", "url", "publication_title", "description", "status", "g_status", "title", "status_writer", "predicted_keywords")]

csv_filename<-gsub(" ", "_", paste0("britishnewspaperarchive_", format(Sys.time(), "%Y_%m_%d_%H_%M_%S"), ".csv"))

write.csv(to_csv, file=csv_filename, row.names = FALSE)

Download OCR

Then download the OCR of the documents identified by the keyword classifier.
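The download itself happens outside this vignette (the CSV written above feeds the project's download tools). As a minimal sketch, assuming the downloads have completed, the OCR availability of the keyword-selected candidates can be checked with the same get_candidate_documents call used below:

# fetch the keyword-selected candidates with their OCR (where downloaded)
ocr_check <- get_candidate_documents(cand_document_id = kwclass_1874_top5000$id, include_ocr = TRUE)
# proportion of selected candidates with non-empty OCR text
mean(!is.na(ocr_check$ocr) & nchar(ocr_check$ocr) > 0)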

Classify OCR: Train Classifier

# build a corpus from the OCR text of the hand-coded documents
classified_corpus<-quanteda::corpus(classdocs[,c("ocr", "EV_article")], text_field = "ocr")
ocr_class_dfm<-durhamevp::preprocess_corpus(classified_corpus, min_termfreq=20, min_docfreq=20, stem=FALSE)
ocr_sets<-durhamevp::split_dfm(ocr_class_dfm, round(nrow(ocr_class_dfm)*.85))
ocr_classifier<-durhamevp::evp_classifiers(ocr_sets$training_set, classifier_type = "xgboost", training_classify_var = "EV_article")

quanteda::docvars(ocr_sets$testing_set, "ocr_predict_xgb")<-predict(ocr_classifier, ocr_sets$testing_set)
quanteda::docvars(ocr_sets$testing_set, "ocr_predict_class_xgb")<-as.numeric(quanteda::docvars(ocr_sets$testing_set, "ocr_predict_xgb")>.5)

ggplot(quanteda::docvars(ocr_sets$testing_set), aes(ocr_predict_xgb, EV_article))+
  geom_point(position="jitter")+
  geom_smooth(method="glm", method.args=list(family="binomial"))
caret::confusionMatrix(data=factor(quanteda::docvars(ocr_sets$testing_set, "ocr_predict_class_xgb")), reference=factor(quanteda::docvars(ocr_sets$testing_set, "EV_article")), mode="prec_recall", positive="1")

Use Classifier on New Material

# quick look at the first few candidate ids selected by the keyword classifier
kwclass_1874_top5000$id[1:5]
needs_ocr_classify<-get_candidate_documents(cand_document_id = kwclass_1874_top5000$id, include_ocr = TRUE)

needs_classify_corpus<-quanteda::corpus(needs_ocr_classify, text_field = "ocr")
needs_classify_dfm<-durhamevp::preprocess_corpus(needs_classify_corpus, min_termfreq=20, min_docfreq=20, stem=FALSE)
needs_classify_dfm<-quanteda::dfm_select(needs_classify_dfm, ocr_sets$training_set)

# this step actually implements the prediction
quanteda::docvars(needs_classify_dfm, "predicted_ocr")<-predict(ocr_classifier, newdata = needs_classify_dfm, type="class")

hist(quanteda::docvars(needs_classify_dfm, "predicted_ocr"))

classified_by_ocr<-quanteda::docvars(needs_classify_dfm)
aa<-left_join(classified_by_ocr, select(kwclass_1874_top5000, id, url, predicted_keywords), by=c("id", "url")) %>%
  dplyr::mutate(std_url = sub("download/", "", url)) %>%
  arrange(-predicted_ocr)

ggplot(aa, aes(predicted_ocr))+
  geom_histogram()

ggplot(aa, aes(predicted_keywords, predicted_ocr))+
  geom_point()+
  stat_smooth(method="lm")

# documents the keyword classifier rates highly but the OCR classifier rates low
bb<-aa[aa$predicted_keywords>.7 & aa$predicted_ocr<.1,]

f1<-lm(predicted_ocr~predicted_keywords, aa)
stargazer::stargazer(f1, type="text")
cor(aa$predicted_keywords, aa$predicted_ocr)


importance <- xgboost::xgb.importance(feature_names = colnames(ocr_sets$training_set), model = ocr_classifier)
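To see which OCR terms drive the classifier, the importance table can be inspected directly or plotted (xgb.plot.importance is part of the xgboost package):

# ten most important features for the OCR classifier
head(importance, 10)
xgboost::xgb.plot.importance(importance, top_n = 10)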

Save results

download_these<-classified_by_ocr[,c("id", "url", "publication_title", "description", "status", "g_status", "title", "status_writer", "predicted_ocr")] %>%
  filter(predicted_ocr>.75) %>%
  mutate(status=1, status_writer="xgb_ocr") %>%
  as_tibble()


csv_filename<-gsub(" ", "_", paste0("britishnewspaperarchive_", format(Sys.time(), "%Y_%m_%d_%H_%M_%S"), ".csv"))

write.csv(download_these, file=csv_filename, row.names = FALSE)

Some more evaluation code

top_ones<-quanteda::docvars(toclass_res_dfm, "predicted_keywords")>.9
quanteda::docvars(toclass_res_dfm, "std_url")[top_ones][1:5]

as_df<-as_tibble(quanteda::docvars(toclass_res_dfm)) %>%
  mutate(pubyear=lubridate::year(publication_date))

# compute the election once per unique publication_date and join it back
# (quicker than computing date_to_election for every row directly)

as_df<-as_df %>%
  group_by(publication_date) %>%
  summarise() %>%
  mutate(election_name=date_to_election(publication_date)) %>%
  right_join(as_df, by="publication_date") 
pp1<-as_df %>%
  left_join(election_dates, by="election_name") %>%
  filter(keywords_13_complete=="Y") %>%
  ggplot(aes(election_name, predicted_keywords, group=election_name)) +
  geom_boxplot()

pp1_violin<-as_df %>%
  left_join(election_dates, by="election_name") %>%
  filter(keywords_13_complete=="Y") %>%
  ggplot(aes(election_name, predicted_keywords, group=election_name)) +
  geom_violin(trim=FALSE)

as_df <- as_df %>%
  mutate(electionn=as.numeric(election_name))

# band a predicted probability into a descriptive likelihood category
prob_class<-function(x){
  case_when(
    x>.75 ~ "highly likely (>.75)",
    x>.5  ~ "likely (>.5 & <=.75)",
    x>.25 ~ "unlikely (<=.5 & >.25)",
    x<=.25 ~ "very unlikely (<=.25)",
    TRUE ~ NA_character_
  )
}
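A quick check of the banding:

# example banding of four probabilities
prob_class(c(.9, .6, .3, .1))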
pp4<-as_df %>%
  mutate(prob_EV_keywords=prob_class(predicted_keywords)) %>%
  mutate(electionn=as.numeric(election_name)) %>%
  filter(electionn<1875) %>%
  group_by(election_name, electionn, prob_EV_keywords) %>%
  tally() %>%
  ggplot(aes(electionn, n))+
  facet_wrap(~prob_EV_keywords, scales="free")+
  geom_point()+
  geom_line()+
  ylim(0, NA) +
  ggtitle("Number of Currently Undownloaded Articles in Each Election by EV_Article Likelihood (Keyword Classifier)")

pp2<-as_df %>%
  group_by(electionn) %>%
  tally()%>%
  ggplot(aes(electionn, n)) +
  geom_point() +
  geom_line()
pp2

dim(results_systematic_searches)
rss<-results_systematic_searches %>%
  group_by(publication_date) %>%
  summarise() %>%
  mutate(election_name=date_to_election(publication_date)) %>%
  right_join(results_systematic_searches, by="publication_date")
pp3<-rss %>%
  group_by(election_name, search_text) %>%
  tally()%>%
  mutate(electionn=as.numeric(election_name)) %>%
  left_join(durhamevp::election_dates, by="election_name") %>%
  mutate(articles_per_day=n/monthsearch_duration) %>%
  filter(keywords_13_complete=="Y") %>%
  ggplot(aes(electionn, articles_per_day)) +
  facet_wrap(~search_text, scales="free")+
  geom_point() +
  geom_line() +
  ylim(0, NA) +
  ggtitle("Articles (per day) Containing 13 Systematic Search Terms During Month Around Elections")+
  theme_bw()
pp3
# link to searches

candocs<-get_candidate_documents(status =c("0","1", "2", "4", "5", "6", "7", "8"), include_ocr=FALSE)

candocs<-candocs %>%
  dplyr::mutate(std_url = sub("download/", "", url))
candocs$EV_article<-ifelse(candocs$g_status %in% c("1", "3"), 1, 0)


candocs_training_set<-candocs[!candocs$std_url %in% testing_urls, ]
candocs_testing_set<-candocs[candocs$std_url %in% testing_urls, ]
descript_test<-classified_results[classified_results$std_url %in% testing_urls,]

select_descript_xgb<-classifier_selection_description(candocs_training_set, descript_test, classifier_type="xgboost", return_logical = TRUE, logical_to_prob=TRUE, stem=FALSE, min_docfreq=20, min_termfreq=20)

descript_test <- descript_test %>%
  mutate(xgb_select=select_descript_xgb)

compare_docs<-descript_test%>%
  inner_join(candocs_testing_set, by="description")

table(compare_docs$xgb_select>.5, compare_docs$g_status)

ggplot(compare_docs, aes(xgb_select, as.numeric(g_status==1)))+
  geom_point(position="jitter")+
  stat_smooth(method="glm", method.args = list(family="binomial"))+
  ggtitle("Performance of the xgboost descrition classifier on the testing set")

compare_docs<-descript_test%>%
  group_by(std_url) %>%
  summarise(maxselect=max(xgb_select)) %>%
  left_join(candocs_testing_set, by="std_url")

check_descript_xgb<-classifier_selection_description(candocs_training_set, candocs_training_set, classifier_type="xgboost", return_logical = TRUE, logical_to_prob=TRUE, stem=FALSE, min_docfreq=50, min_termfreq=50)

descript_test <- candocs_training_set %>%
  mutate(xgb_select=check_descript_xgb)

compare_docs<-descript_test%>%
  inner_join(candocs_training_set, by="description")

table(compare_docs$xgb_select>.5, compare_docs$g_status.x)

ggplot(compare_docs, aes(xgb_select, as.numeric(g_status.x=="1")))+
  geom_point(position="jitter")+
  stat_smooth(method="glm", method.args = list(family="binomial"))+
  ggtitle("Performance of the xgboost descrition classifier on the training set")

View classification results

aa<-left_join(needs_ocr_classify, classified_by_ocr, by=c("id", "url")) %>%
  filter(predicted_ocr>.75)
documents_to_latex(sample_n(aa, 30), include_ocr=TRUE, pandoc_output=TRUE)

